################################################################################################
rm(list=ls()) 
x=c("rgdal","rgeos","sp","geosphere","tidyr","broom","ggplot2")

lapply(x, library, character.only=TRUE) # load the required packages
library(dplyr)
library(data.table)

setwd("Derived Data")

# units are in MwH

# Import plant information
dfcoal=read.csv("dfcoal.csv", stringsAsFactors = FALSE)
dfcoalprod=read.csv("dfcoalprod.csv", stringsAsFactors = FALSE)

finalfuelagg=read.csv("finalfuelagg.csv", stringsAsFactors = FALSE)

aggprod=read.csv("aggprod.csv", stringsAsFactors = FALSE)


preparedataset <- function(distance){    
    ###################################### Match to district ############## #######################################################
    
    # convert coal df to spatial points 
    
    coordinates(dfcoal)=cbind(dfcoal$Longitude,dfcoal$Latitude) # set the coordinates 
    proj4string(dfcoal)=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
    
    #### Import school district
    setwd("../Raw Data/6. SEDA DATA")
    
    sch_dist=readOGR(dsn="seda_shapefiles_2019_4.0", layer="seda_shapefiles_2019_4")
    
    # only keep necessary columns:
    sch_dist@data=sch_dist@data[, c("NAME","GEOID")]
    
    # GEOID IS a character of length 7
    # turn into numeric
    sch_dist@data$leaid=as.numeric(sch_dist@data$GEOID) 
    
    # create centroid of school district
    centroid_sch=gCentroid(sch_dist, byid=TRUE) 
    # create spdf with centroid:
    
    schdist_centr=SpatialPointsDataFrame(centroid_sch,data=sch_dist@data) 
    
    schdist_centr=spTransform(schdist_centr, proj4string(dfcoal))
    
    rm(centroid_sch)
    # want to get sum of district by year pollution : 

    dfout=lapply(1:nrow(dfcoal@data), function(ind){
      
      pt=dfcoal[ind,]
      
      distvec=distGeo(pt,schdist_centr)/1000 # distance from the coal plant to the school district
      
      distradius=which(distvec<distance) # this part establishes the distance
      if(length(distradius)>0){
        schdist_radius=schdist_centr[distradius,]
        
        schdist_radius$distveckm=distGeo(pt, schdist_radius)/1000  # this is distance vec in km
        
        # create mini dataframe for this coal plant
        ptrep=pt[rep(seq_len(nrow(pt)), each = nrow(schdist_radius@data)), ] 
        
        # final dataframe
        dfout=data.frame(ptrep@data, 
                         schdist_radius@data)
        
        
        return(dfout)
      }
      
      
    })
    
    
    plant_dist=do.call(rbind,dfout)
    
    
    ##################################################
    # final merge production with district info
    # coalprod: plant x year production; # plant_dist: stacked plant_district match
    
    
    gc()
    prod_dist_year=merge(dfcoalprod,plant_dist,by=c("PlantID","State")) # checked 
    # CLEANING
    rm(dfcoal,dfcoalprod,dfout,schdist_centr, plant_dist)
    
  
    
    prod_dist_year=prod_dist_year[,c("leaid","year","AY_prod","fuel","distveckm","PlantID","State")]
    

    
    yearlyproduction_coal=prod_dist_year %>% filter(fuel=="coal") %>% group_by(leaid,year) %>% summarize(annualcoal=sum(AY_prod)) 
    yearlyproduction_gas=prod_dist_year %>% filter(fuel=="gas") %>% group_by(leaid,year) %>% summarize(annualgas=sum(AY_prod))

    yearlyproduction_oil=prod_dist_year %>% filter(fuel=="oil") %>% group_by(leaid,year) %>% summarize(annualoil=sum(AY_prod))

    yearlyproduction_renewable=prod_dist_year %>% filter(fuel=="renewable") %>% group_by(leaid,year) %>% summarize(annualrenewable=sum(AY_prod))

    allyear=merge(yearlyproduction_coal,yearlyproduction_gas,by=c("leaid","year"),all=T)
    allyear=merge(allyear,yearlyproduction_oil,by=c("leaid","year"),all=T)
    allyear=merge(allyear,yearlyproduction_renewable,by=c("leaid","year"),all=T)
    
    
    # fill in all years from 2009 to 2018
    year=c(2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018)
    allyearlist=lapply(1:nrow(sch_dist@data), function(ind){
      
      pt=sch_dist@data[ind,]
      leaid=rep(pt$leaid,length(year))
      
      dfout=data.frame(leaid,year)
      
      
    })
    allprodleaid_to_fill=do.call(rbind,allyearlist) # checked
    
    # merge in dfleaid_prod:
    allyear=merge(allyear,allprodleaid_to_fill,by=c("leaid","year"),all=TRUE)

    #missing means zero production
    allyear$annualcoal[is.na(allyear$annualcoal)]=0
    allyear$annualgas[is.na(allyear$annualgas)]=0
    allyear$annualoil[is.na(allyear$annualoil)]=0
    allyear$annualrenewable[is.na(allyear$annualrenewable)]=0
    
    
    # the lags should be built here:
    allyear=allyear %>% group_by(leaid)%>% mutate(annualcoallag=dplyr::lag(annualcoal, n=1, order_by=year),
                               annualgaslag=dplyr::lag(annualgas, n=1, order_by=year),
                               annualoillag=dplyr::lag(annualoil, n=1, order_by=year),
                               annualrenewablelag=dplyr::lag(annualrenewable, n=1, order_by=year))
    
    #create a table for all years : units are in MwH
    if (distance==40){
      distanalysis=prod_dist_year %>% group_by(leaid,fuel,year) %>% summarise(ayprodleaid=sum(AY_prod)) %>% ungroup()
      fwrite(distanalysis,"../../Derived Data/distanalysisleaid.csv")
    }
  
    
    # final sum of leaid by year production by distance bin: this dataframe only has positive production: summarize over plants
    dfleaid_prod=prod_dist_year%>% filter(year==2005) %>% group_by(leaid,fuel) %>% summarise(ayprodleaid=sum(AY_prod)) %>% ungroup()
    # only need 2005 production: 
    
    ####################### Fill in database: fuel type by leaid
    fuel=c("coal","gas","oil","renewable")
    schdistlist=lapply(1:nrow(sch_dist@data), function(ind){
      
      pt=sch_dist@data[ind,]
       leaid=rep(pt$leaid,length(fuel))
      
      dfout=data.frame(leaid,fuel)
      
      
    })
    
    leaid_to_fill=do.call(rbind,schdistlist) # checked
    
    # merge in dfleaid_prod:
    dfleaid_prod=merge(dfleaid_prod,leaid_to_fill,by=c("leaid","fuel"),all=TRUE)
    
    # if NA: zero prod
    dfleaid_prod$ayprodleaid[is.na(dfleaid_prod$ayprodleaid)]=0  # this is a dataframe of ALL districts x ALL distance bins
    ##############################
    # CALCULATE: 2005 SHARES
    agg2005=aggprod[aggprod$year==2005,]; agg2005$year=NULL
    
    prod2005shares=merge(dfleaid_prod,agg2005,by="fuel")
    
    prod2005shares$ayproduction2005=prod2005shares$ayprodleaid
    prod2005shares=prod2005shares[,c("fuel","leaid", "ayproduction2005")]
    
    
    #clean: 
    rm(sch_dist, schdistlist)
   
    # Reshape the data frame
    dfleaid_fuel=reshape(prod2005shares, idvar = c("leaid"), timevar = "fuel", direction = "wide")
    #checked
    setwd("../../Derived Data")

    ####################### Fill in database: dist by bin by year 
    
    
    ###################################################
    
      df_test=fread("df_avg_noinst.csv", stringsAsFactors = FALSE, header=TRUE)
    
    
      ###################
      dfout=merge(df_test,dfleaid_fuel, by=c("leaid"),all=TRUE)

      dfout=merge(dfout,finalfuelagg, by=c("year"))
      
      dfout=merge(dfout,allyear,by=c("leaid","year"),all=TRUE)
      
      
      dfout=dfout %>% rename_with(~paste0(.,"_",distance,"km"), ayproduction2005.coal:ayproduction2005.renewable)
      
      
      fwrite(dfout,paste("dfbartik2005_",distance,"km.csv",sep=""))
  ##################################################################
}



preparedataset(40)

wd="C:/Users/mg5797/Documents/Pollution/ReplicationPackage"


preparedataset(60)